import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
#import data
# NOTE(review): absolute desktop path — only works on the author's machine; consider a relative path or CLI arg.
final = pd.read_csv("/Users/galenhancock/Desktop/Capstone/final_preprocessed3.csv", encoding='latin1', header=0)
final.info()
# Drop identifier and date columns that carry no predictive signal.
final = final.drop(['Breed', 'Birthday', 'Unnamed: 0', 'ClientNumber', 'Client_UNID', 'Client_UNID_Date', 'AssignDate', 'CreateDate', 'GivenDate', 'PetNumber', 'practice_client_UNID', 'tagId'], axis=1)
final.head()
# One-hot encode the pet species into Species_* indicator columns.
final = pd.get_dummies(final, columns = ['Species'])
# Fixed column order; the positional iloc slices below depend on it:
# 0-22 ids/scores/demographics, 23-37 Species_*, 38-51 content_*,
# 52-61 media_*, 62-64 species_*, 65 venue_WaitingRoom, 66-67 media_type_*.
columns_ordered = ['UNID','CM_status', 'CareManagement', 'CareManagement_prev',
'Count_of_pets', 'Dental', 'Dental_prev', 'Dental_status',
'FleaTick', 'FleaTick_prev', 'Fleatick_status', 'HasMicrochip',
'Heartwork_status', 'Heartworm', 'Heartworm_prev', 'Microchip',
'Microchip_prev', 'Microchip_status', 'Num_vaccines_bought',
'PodioId', 'PracticeDoctorID', 'displayid', 'Age', 'Species_Bovine',
'Species_Camelid', 'Species_Canine', 'Species_Caprin',
'Species_Cavia', 'Species_Equine', 'Species_Feline',
'Species_Lagomo', 'Species_Lagomorp', 'Species_Mustelid',
'Species_Pocket P', 'Species_Porcine', 'Species_Poultry',
'Species_Reptile', 'Species_Rodent', 'content_Charity',
'content_Cophraphagia', 'content_Dental', 'content_Flea',
'content_General', 'content_Heartworm', 'content_Insurance',
'content_Laser', 'content_Microchip', 'content_Nutrition',
'content_Senior', 'content_Surgery', 'content_Weight',
'content_Youth', 'media_Adoptable', 'media_Biography',
'media_EduSponsored', 'media_EduUnsponsored', 'media_Funny',
'media_Practice', 'media_SponsorStatic', 'media_SponsorVid',
'media_Support', 'media_Welcome', 'species_All', 'species_Cat',
'species_Dog', 'venue_WaitingRoom', 'media_type_image',
'media_type_video']
final = final[columns_ordered]
#get sum of tags across each umbrella category, so that the percentage of each tag seen in each category can be calculated.
#For example, in some cases, 20% of the media's content was dental related, while the remaining 80% was distributed across all 13 other content tags it could possess.
final['total_content_tags'] = final.iloc[:,38:52].sum(axis=1)  # 14 content_* columns
final['total_media_tags'] = final.iloc[:, 52:62].sum(axis=1)  # 10 media_* columns
final['total_species_tags'] = final.iloc[:, 62:65].sum(axis=1)  # species_All/Cat/Dog
final['total_mediatype_tags'] = final.iloc[:, 66:68].sum(axis=1)  # media_type_image + media_type_video
final['total_ad_tags'] = final.iloc[:, 38:66].sum(axis=1)  # all ad tags: content + media + species + venue
final['Count_of_pets'] = final['Count_of_pets'].astype(int)
final.head()
#standardize these columns
#average of ad tags in each category
#Each raw tag count becomes a percentage of its umbrella-category total,
#so rows with different total exposure are comparable.
#(The 30 near-identical assignments are collapsed into loops; arithmetic is unchanged.)
content_tag_cols = ['content_Charity', 'content_Cophraphagia', 'content_Dental',
                    'content_Flea', 'content_General', 'content_Heartworm',
                    'content_Insurance', 'content_Laser', 'content_Microchip',
                    'content_Nutrition', 'content_Senior', 'content_Surgery',
                    'content_Weight', 'content_Youth']
media_tag_cols = ['media_Adoptable', 'media_Biography', 'media_EduSponsored',
                  'media_EduUnsponsored', 'media_Funny', 'media_Practice',
                  'media_SponsorStatic', 'media_SponsorVid', 'media_Support',
                  'media_Welcome']
species_tag_cols = ['species_All', 'species_Cat', 'species_Dog']
for tag_col in content_tag_cols:
    final[tag_col] = (final[tag_col]/final['total_content_tags'])*100
for tag_col in media_tag_cols:
    final[tag_col] = (final[tag_col]/final['total_media_tags'])*100
for tag_col in species_tag_cols:
    final[tag_col] = (final[tag_col]/final['total_species_tags'])*100
#venue is expressed as a share of ALL ad tags rather than a venue-only total.
final['venue_WaitingRoom'] = (final['venue_WaitingRoom']/final['total_ad_tags'])*100
for tag_col in ['media_type_image', 'media_type_video']:
    final[tag_col] = (final[tag_col]/final['total_mediatype_tags'])*100
# Encode the categorical score columns as ordered integers.
# change columns: worse < unchanged < better
change_mapping = {'unchanged': 1,
                  'worse': 0,
                  'better': 2}
# status columns: BAD < CAUTION < OK
status_mapping = {'OK': 2,
                  'CAUTION': 1,
                  'BAD': 0}
# NOTE(review): values absent from a mapping become NaN under .map — presumably
# the CSV only ever contains these labels; verify upstream.
for change_col in ['CM_status', 'Dental_status', 'Heartwork_status',
                   'Microchip_status', 'Fleatick_status']:
    final[change_col] = final[change_col].map(change_mapping)
for score_col in ['CareManagement', 'Dental', 'Heartworm', 'Microchip', 'FleaTick',
                  'CareManagement_prev', 'Dental_prev', 'Heartworm_prev',
                  'Microchip_prev', 'FleaTick_prev']:
    final[score_col] = final[score_col].map(status_mapping)
#sum current columns, and previous columns
#BUGFIX: sum_prev originally added the CURRENT 'Microchip' score instead of
#'Microchip_prev', so before/after differences were skewed for that category.
final['sum_prev'] = (final['CareManagement_prev'] + final['Dental_prev']
                     + final['FleaTick_prev'] + final['Heartworm_prev']
                     + final['Microchip_prev'])
final['sum_current'] = (final['CareManagement'] + final['Dental']
                        + final['FleaTick'] + final['Heartworm'] + final['Microchip'])
#Positive difference means overall compliance improved after the appointment.
final['difference_before_after'] = final['sum_current'].astype(int) - final['sum_prev'].astype(int)
final.difference_before_after.describe()
final.to_csv('final_with_integers.csv')
#build a representative dataset so that pets are at least 'OK' in one category
#dataset has to represent change as well as instances of all possible scorecard rankings for each of the below scorecard categories
CM_OK = final.loc[final['CareManagement'] == 2]
DENTAL_OK = final.loc[final['Dental'] == 2]
FLEATICK_OK = final.loc[final['FleaTick'] == 2]
MICROCHIP_OK = final.loc[final['Microchip'] == 2]
HEARTWORM_OK = final.loc[final['Heartworm'] == 2]
#DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
OK_df = pd.concat([CM_OK, DENTAL_OK, FLEATICK_OK, MICROCHIP_OK, HEARTWORM_OK])
#a pet can be 'OK' in several categories at once, so drop the duplicated rows.
OK_df = OK_df.drop_duplicates(keep='first')
print(OK_df.difference_before_after.value_counts())
len(OK_df)
print(OK_df.Heartworm.value_counts())
print(OK_df.FleaTick.value_counts())
print(OK_df.Dental.value_counts())
print(OK_df.CareManagement.value_counts())
print(OK_df.Microchip.value_counts())
OK_df.describe()
#with the OK_df dataframe, we have a representation of compliant and under-compliant pets
OK_df.difference_before_after.describe()
OK_df.sum_current.value_counts()
# Feature and score columns to correlate for the heat map below.
cols = ['Age',
'Count_of_pets',
'Num_vaccines_bought',
'Species_Canine',
'Species_Feline',
'content_Charity',
'content_Cophraphagia',
'content_Dental',
'content_Flea',
'content_General',
'content_Heartworm',
'content_Insurance',
'content_Laser',
'content_Microchip',
'content_Nutrition',
'content_Senior',
'content_Surgery',
'content_Weight',
'content_Youth',
'media_Biography',
'media_EduSponsored',
'media_EduUnsponsored',
'media_Practice',
'media_SponsorStatic',
'media_SponsorVid',
'media_Support',
'media_Welcome',
'media_Funny',
'species_All',
'species_Cat',
'species_Dog',
'venue_WaitingRoom',
'media_type_image',
'media_type_video',
'Dental',
'Dental_prev',
'Heartworm',
'Heartworm_prev',
'Microchip',
'Microchip_prev',
'FleaTick',
'FleaTick_prev',
'CareManagement',
'CareManagement_prev']
#'something_changed_better',
#'something_changed_worse']
#Pearson product-moment correlation coefficients
# Transpose so each variable is a row, as np.corrcoef expects.
cm = np.corrcoef(OK_df[cols].values.T)
# Get and show our heat map
plt.figure(figsize=(30,30))
sns.set(font_scale = 2)
hm = sns.heatmap(cm,
cbar=True,
annot=True,
square=True,
fmt='.1f',
annot_kws={'size': 10},
yticklabels=cols,
xticklabels=cols)
plt.tight_layout()
plt.show()
# Save the rendered heat map alongside the script.
figure = hm.get_figure()
figure.savefig('CorrelationMatrix.png', dpi=400)
#sns.plot.savefig("CorrelationMatrix.png")
#features = attributes of owner, pet, current scorecard, scorecard goal
#targets = types of content that should be shown
#our dataset must have instances of when something changed for the better, because that is our goal
all_cols_list = list(OK_df.columns.values)
print(all_cols_list)
#fitting different models for different ad types
#first is for distribution of content
#second is for distribution of media
#third is for distribution of species
#fourth is for distribution of media_type
# Full feature set: pet demographics plus current and previous scorecard values.
X_cols_all = [
'Age',
'Dental',
'Heartworm',
'Dental_prev',
'Heartworm_prev',
'Microchip',
'Microchip_prev',
'FleaTick',
'FleaTick_prev',
'CareManagement',
'CareManagement_prev',
'Count_of_pets',
'Species_Canine',
'HasMicrochip']
# Reduced feature set: current scorecard values only.
X_cols_some = [
'Dental',
'Heartworm',
'Microchip',
'FleaTick',
'CareManagement']
# NOTE(review): Y_cols_all is not referenced later in this file — candidate for removal.
Y_cols_all = ['content_Charity',
'content_Cophraphagia',
'content_Dental',
'content_Flea',
'content_General',
'content_Heartworm',
'content_Insurance',
'content_Laser',
'content_Microchip',
'content_Nutrition',
'content_Senior',
'content_Surgery',
'content_Weight',
'content_Youth',
'media_Biography',
'media_EduSponsored',
'media_EduUnsponsored',
'media_Practice',
'media_SponsorStatic',
'media_SponsorVid',
'media_Support',
'media_Welcome',
'species_All',
'species_Cat',
'species_Dog',
'media_type_video',
'media_type_image']
# The 14 content-tag percentage columns (one umbrella category).
content = ['content_Charity',
'content_Cophraphagia',
'content_Dental',
'content_Flea',
'content_General',
'content_Heartworm',
'content_Insurance',
'content_Laser',
'content_Microchip',
'content_Nutrition',
'content_Senior',
'content_Surgery',
'content_Weight',
'content_Youth']
Y_col_Heartworm = ['Heartworm']
Y_col_Dental = ['Dental']
# Media-theme tag columns (media_Adoptable is excluded here).
media = ['media_Biography',
'media_EduSponsored',
'media_EduUnsponsored',
'media_Practice',
'media_SponsorStatic',
'media_SponsorVid',
'media_Support',
'media_Welcome',
'media_Funny']
#species_All not being used due to high correlation with species_Dog
species = [
'species_Cat',
'species_Dog']
#media_type_video eliminated due to high correlation with media_type_image
# NOTE(review): despite the comment above, medtype still contains both columns — confirm intent.
medtype = ['media_type_image', 'media_type_video']
# NOTE(review): KMeans_content is not referenced later in this file — candidate for removal.
KMeans_content = ['content_Charity',
'content_Cophraphagia',
'content_Dental',
'content_Flea',
'content_General',
'content_Heartworm',
'content_Insurance',
'content_Laser',
'content_Microchip',
'content_Nutrition',
'content_Senior',
'content_Surgery',
'content_Weight',
'content_Youth',
'Heartworm']
#get two arrays of features for testing, one with more explanatory variables included.
X = OK_df[X_cols_all].values
X_some = OK_df[X_cols_some].values
# Per-category outcome vectors used as classification targets below.
Heartworm_outcome = OK_df['Heartworm'].values
FleaTick_outcome = OK_df['FleaTick'].values
Dental_outcome = OK_df['Dental'].values
sum_current_outcome = OK_df['sum_current'].values
difference = OK_df['difference_before_after'].values
#Create a Multinomial Logistic Regressor to classify Scorecard ratings based on input variables:
#This implementation is from sci-kit learn. We will use it to observe the model coefficients of the X variables (rate of ad type seen in the room during the course of the appointment)
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
#input into this function is the list of X columns desired and the y array
def classify_by_multilog(X_cols, y):
    """Fit a multinomial logistic regression of y on OK_df[X_cols].

    Plots a bar chart of the fitted coefficients for each class, then prints
    the mean accuracy, log loss, and a classification report on the held-out
    test set.

    Parameters
    ----------
    X_cols : list of str
        Feature column names in OK_df.
    y : array-like
        Target vector aligned with OK_df's rows.
    """
    # NOTE(review): no random_state on the split, so results differ run to run.
    X_train, X_test, y_train, y_test = train_test_split(OK_df[X_cols].values, y,
                                                        train_size=0.7)
    # Scale features to [0, 1]; fit on the training split only to avoid leakage.
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)
    logit = LogisticRegression(random_state=0, multi_class='multinomial', solver='lbfgs')
    model = logit.fit(X_train, y_train)
    new_y = model.predict(X_test)
    pred = model.predict_proba(X_test)
    # One coefficient chart per class: coef_ rows align with classes_.
    for idx, model_class in enumerate(model.classes_):
        y_pos = np.arange(len(X_cols))
        plt.bar(y_pos, model.coef_[idx], align='center', alpha=0.5)
        plt.xticks(y_pos, X_cols, rotation='vertical')
        plt.ylabel('Value')
        plt.title('Coefficients of Independent Variables for Y = %i' % model_class)
        plt.show()
    print('Model Score: ', model.score(X_test, y_test))
    print('Log Loss: ', log_loss(y_test, pred))
    print(classification_report(y_test, new_y))
#Below are visuals indicating the model coefficients when Y = 0, 1, or 2 for each Scorecard category of Heartworm, Dental, and Flea/Tick. There are four models, each with features drawn from one family of media attributes: "Content" tags, "Media theme" tags, "Media Type" tags, and "Species Type" tags.
#Classify each scorecard outcome (Heartworm, Dental, Flea/Tick) from each
#family of ad tags in turn: content, species, media theme, and media type.
#Call order matches the original one-call-per-line version exactly.
for tag_family in (content, species, media, medtype):
    for outcome in (Heartworm_outcome, Dental_outcome, FleaTick_outcome):
        classify_by_multilog(tag_family, outcome)
#ALL media tags at once:
cols_all = ['content_Charity',
'content_Cophraphagia',
'content_Dental',
'content_Flea',
'content_General',
'content_Heartworm',
'content_Insurance',
'content_Laser',
'content_Microchip',
'content_Nutrition',
'content_Senior',
'content_Surgery',
'content_Weight',
'content_Youth',
'media_Biography',
'media_EduSponsored',
'media_EduUnsponsored',
'media_Practice',
'media_SponsorStatic',
'media_SponsorVid',
'media_Support',
'media_Welcome',
'species_All',
'species_Dog',
'media_type_video']
#Fit the combined feature set against each scorecard outcome in turn.
for outcome in (Heartworm_outcome, Dental_outcome, FleaTick_outcome):
    classify_by_multilog(cols_all, outcome)
#Create function that applies RandomForestRegressor to predict the values of ad rates for each "umbrella" ad group:
def evaluate_model(X_cols_list, columns_list):
    """Fit a multi-output random forest of OK_df[columns_list] on OK_df[X_cols_list].

    Plots an example predicted ad-type distribution and the feature
    importances, prints train/test MSE and R2 plus the out-of-bag score.

    Parameters
    ----------
    X_cols_list : list of str
        Feature column names in OK_df.
    columns_list : list of str
        Target (ad-rate) column names in OK_df.

    Returns
    -------
    numpy.ndarray
        Predictions for the held-out test rows. (Originally the function
        returned None, so the `predict_* = evaluate_model(...)` callers
        printed None; returning the predictions fixes that.)
    """
    X_train, X_test, y_train, y_test = train_test_split(
        OK_df[X_cols_list].values, OK_df[columns_list].values, train_size=0.6)
    # Scale features to [0, 1]; fit on the training split only.
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)
    rand_for = RandomForestRegressor(max_depth=30, min_samples_leaf=3, bootstrap=True,
                                     oob_score=True, n_estimators=50, random_state=1)
    rand_for.fit(X_train, y_train)
    y_rf = rand_for.predict(X_test)
    # Show the predicted ad-type mix for the first test row as an example.
    ad_breakdown = pd.Series(y_rf[0], columns_list)
    ad_breakdown.plot(x='Ad Type Breakdown by Percentage', kind='bar', rot=90, fontsize=10)
    print("Example distributions based on first predicted value from Test set.")
    print("Example input: ", X_test[0])  # typo "Exmple" fixed
    plt.title('Example Distribution of Ad Types Recommended')
    plt.show()
    y_rf_train = rand_for.predict(X_train)
    print(f'MSE train score: {mean_squared_error(y_train, y_rf_train)}')
    print(f'MSE test score: {mean_squared_error(y_test, y_rf)}')
    print(f'R2 train score: {r2_score(y_train, y_rf_train)}')
    print(f'R2 test score: {r2_score(y_test, y_rf)}')
    print(f'Out-of-Bag test score: {rand_for.oob_score_}')
    #Display the feature importances in order of magnitude as a result of the Random Forest Regression model
    f_importances = pd.Series(rand_for.feature_importances_, X_cols_list)
    # Sort the array in descending order of the importances
    f_importances.sort_values(ascending=False, inplace=True)
    # Make a bar Plot
    f_importances.plot(x='Features', y='Importance', kind='bar', figsize=(16,9), rot=90, fontsize=30)
    plt.title('Feature Importance from Random Forest Regressor')
    plt.tight_layout()
    plt.show()
    return y_rf
#Below are the outputs of the Random Forest Regression models, in which the media-tag rates are the output variables. The goal of fitting a Random Forest Regression model was to "recommend" the distribution of content to be shown based on features of the pet owner, the pet's compliance history, and its compliance goals.
#With age and Count of Pets
# Fit one regressor per umbrella ad group using the full feature set (X_cols_all).
predict_content = evaluate_model(X_cols_all, content)
predict_media = evaluate_model(X_cols_all, media)
predict_species = evaluate_model(X_cols_all, species)
predict_medtype = evaluate_model(X_cols_all, medtype)
print(predict_content)
print(predict_media)
print(predict_species)
print(predict_medtype)
#Without age and Count of Pets
#As seen in the correlation matrix as well, Dental scores appear to be most important feature and thus a model could be built for advertisement selection in order to influence
# Repeat with the reduced feature set (current scorecard values only).
predict_content2 = evaluate_model(X_cols_some, content)
predict_media2 = evaluate_model(X_cols_some, media)
predict_species2 = evaluate_model(X_cols_some, species)
predict_medtype2 = evaluate_model(X_cols_some, medtype)
print(predict_content2)
print(predict_media2)
print(predict_species2)
print(predict_medtype2)
#Logit models below are fitted WITH 'Age' and the owner's 'Count of pets' included in the feature vector.
#The six experiments below differed only in their feature list and target
#column; the copy-pasted split/scale/fit/report pipeline is factored into one
#helper so each experiment is a single call.
def fit_multinomial_logit(feature_cols, target_col):
    """Fit a multinomial logistic regression of OK_df[target_col] on OK_df[feature_cols].

    Splits 70/30, min-max scales the features (fit on the training split
    only), fits an lbfgs multinomial LogisticRegression, then prints the
    classification report, mean accuracy, and log loss on the test set —
    in that order, matching the original inline blocks.
    """
    X = OK_df[feature_cols].values
    Ys = OK_df[target_col].values
    # NOTE(review): no random_state on the split, so results differ run to run.
    X_train, X_test, y_train, y_test = train_test_split(X, Ys,
                                                        train_size=0.7)
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)
    logit = LogisticRegression(random_state=0, multi_class='multinomial', solver='lbfgs')
    model = logit.fit(X_train, y_train)
    new_y = model.predict(X_test)
    pred = model.predict_proba(X_test)
    print(classification_report(y_test, new_y))
    print('Model Score: ', model.score(X_test, y_test))
    print('Log Loss: ', log_loss(y_test, pred))

#Demographic + previous-score features shared by the experiments below.
base_cols = ['Age', 'Count_of_pets', 'Heartworm_prev', 'Dental_prev', 'FleaTick_prev']
#modeling just Score ~ Age + Count of Pets
fit_multinomial_logit(['Age', 'Count_of_pets'], 'Heartworm')
# model with features of Age, Count of pets, Heartworm previous score, Dental previous score, FleaTick previous score and target
# Post-appointment Heartworm scorecard value.
fit_multinomial_logit(base_cols, 'Heartworm')
#Base features plus content ad tags to CLASSIFY the Dental Scorecard outcome (0, 1, 2).
fit_multinomial_logit(base_cols + content, 'Dental')
#Base features plus media-theme ad tags to CLASSIFY the Dental Scorecard outcome (0, 1, 2).
#(media_Funny is deliberately absent, matching the original feature list.)
media_theme_cols = ['media_Biography',
                    'media_EduSponsored',
                    'media_EduUnsponsored',
                    'media_Practice',
                    'media_SponsorStatic',
                    'media_SponsorVid',
                    'media_Support',
                    'media_Welcome']
fit_multinomial_logit(base_cols + media_theme_cols, 'Dental')
#Base features plus species ad tags to CLASSIFY the Dental Scorecard outcome (0, 1, 2).
fit_multinomial_logit(base_cols + ['species_All', 'species_Dog'], 'Dental')
#Base features plus media type to CLASSIFY the Dental Scorecard outcome (0, 1, 2).
#(Only media_type_image, per the collinearity note earlier in the file.)
fit_multinomial_logit(base_cols + ['media_type_image'], 'Dental')
#Multi output regression and RandomForestRegressor information
#https://nealde.github.io/blog/2017/06/15/Random-Forest-Tutorial/